In [27]:
import boto
import pandas as pd
import numpy as np
import cufflinks as cf
import scipy.stats as stats
from scipy.stats import chi2_contingency
import plotly.graph_objs as go
import plotly.offline as py
from plotly import tools
import statsmodels.formula.api as sm

cf.go_offline()
py.init_notebook_mode()

c = boto.connect_s3()
b = c.get_bucket('nichung-datasets')

Load CSV and assign to dataframe

In [3]:
mergeNY13_df = pd.read_csv('s3:/nichung-datasets/ss13ny.csv')
In [5]:
mergeNY14_df = pd.read_csv('s3:/nichung-datasets/ss14ny.csv')
In [7]:
mergeNY15_df = pd.read_csv('s3:/nichung-datasets/ss15ny.csv')
In [8]:
ny_puma_df = pd.read_csv('s3:/nichung-datasets/puma_ny.csv')

r a c e & o t h e r s e s i n d i c a t o r s

Questions (test):

  • what does internet access look like against health insurance coverage type, by age group?
  • what does internet access look like against health insurance coverage type, by age group?

In [9]:
#health insurance coverage

#by age
coverage_by_age_13 = pd.crosstab(mergeNY13_df.HICOV, mergeNY13_df.age_group_13, margins=True)
coverage_by_age_13.columns = ['<18', '18-24', '25-34', '35-44', '45-54', '55-64', '>65', 'rowtotal']
coverage_by_age_13.index= ["yes","no","coltotal"] 

coverage_by_age_13
Out[9]:
<18 18-24 25-34 35-44 45-54 55-64 >65 rowtotal
yes 13296 19113 20153 26300 24851 41989 31037 176739
no 2575 4504 3258 2968 2025 1996 208 17534
coltotal 15871 23617 23411 29268 26876 43985 31245 194273
In [10]:
#by race
coverage_by_race_13 = pd.crosstab(mergeNY13_df.RAC1P, mergeNY13_df.HICOV, margins=True)
coverage_by_race_13.columns= ["yes","no","rowtotal"] 
coverage_by_race_13.index = ['white', 'black', 'american indian', 'alaska native', 'catch-all native', 'asian', 'pacific islander', 'other', '2+ races', 'coltotal']

insured_by_race_13 = coverage_by_race_13/coverage_by_race_13.ix['coltotal', 'rowtotal']

insured_by_race_13_no_total = insured_by_race_13.ix[0:9,0:2]
insured_by_race_13_no_total
Out[10]:
yes no
white 0.659995 0.049229
black 0.115377 0.015268
american indian 0.001861 0.000464
alaska native 0.000005 0.000010
catch-all native 0.001081 0.000163
asian 0.064313 0.009563
pacific islander 0.000408 0.000046
other 0.043586 0.011995
2+ races 0.023628 0.003008
In [11]:
insured_by_race_13_no_total.iplot(kind='bar', barmode='stack', filename='charts/health-coverage-by-race.html')
In [12]:
#by race
coverage_by_race_13 = pd.crosstab(mergeNY13_df.HICOV, mergeNY13_df.RAC1P, margins=True)
coverage_by_race_13.columns = ['white', 'black', 'american indian', 'alaska native', 'catch-all native', 'asian', 'pacific islander', 'other', '2+ races', 'rowtotal']
coverage_by_race_13.index = ["yes","no","coltotal"]

coverage_by_race_13_no_total = coverage_by_race_13.ix[0:2,0:9]
coverage_by_race_13_no_total
Out[12]:
white black american indian alaska native catch-all native asian pacific islander other 2+ races
yes 129468 22633 365 1 212 12616 80 8550 4635
no 9657 2995 91 2 32 1876 9 2353 590
In [13]:
coverage_by_race_13_no_total.iplot(kind='bar', barmode='stack', filename='charts/health-coverage-by-race.html')
In [14]:
#private coverage
privcov_by_age_13 = pd.crosstab(mergeNY13_df.PRIVCOV, mergeNY13_df.age_group_13, margins=True)
privcov_by_age_13.columns = ['<18', '18-24', '25-34', '35-44', '45-54', '55-64', '>65', 'rowtotal']
privcov_by_age_13.index= ["yes","no","coltotal"]

#public coverage
pubcov_by_age_13 = pd.crosstab(mergeNY13_df.PUBCOV, mergeNY13_df.age_group_13, margins=True)
pubcov_by_age_13.columns = ['<18', '18-24', '25-34', '35-44', '45-54', '55-64', '>65', 'rowtotal']
pubcov_by_age_13.index= ["yes","no","coltotal"]

#internet access
access_by_age_13 = pd.crosstab(mergeNY13_df.ACCESS, mergeNY13_df.age_group_13, margins=True)
access_by_age_13.columns = ['<18', '18-24', '25-34', '35-44', '45-54', '55-64', '>65', 'rowtotal']
access_by_age_13.index= ["yes","kinda", "no","coltotal"]
In [15]:
pubcov_by_age_13 = pd.crosstab(mergeNY13_df.PUBCOV, mergeNY13_df.age_group_13, margins=True);
pubcov_by_age_13.columns = ['<18', '18-24', '25-34', '35-44', '45-54', '55-64', '>65', 'rowtotal'];
pubcov_by_age_13.index= ["yes","no","coltotal"];
access_by_age_13 = pd.crosstab(mergeNY13_df.ACCESS, mergeNY13_df.age_group_13, margins=True);
access_by_age_13.columns = ['<18', '18-24', '25-34', '35-44', '45-54', '55-64', '>65', 'rowtotal'];
access_by_age_13.index= ["yes","kinda", "no","coltotal"];
In [16]:
access_by_age_13 = pd.crosstab(mergeNY13_df.ACCESS, mergeNY13_df.age_group_13, margins=True);
access_by_age_13.columns = ['<18', '18-24', '25-34', '35-44', '45-54', '55-64', '>65', 'rowtotal'];
access_by_age_13.index= ["yes","kinda", "no","coltotal"];
In [17]:
# select crosstab by column and convert to percentile per age group
privc_less18_13 = privcov_by_age_13['<18'];
private_less18_13 = privc_less18_13/privc_less18_13.ix['coltotal', 'rowtotal'];
yes_private_less18_13 = private_less18_13[0];

privc_1824_13 = privcov_by_age_13['18-24'];
private_1824_13 = privc_1824_13/privc_1824_13.ix['coltotal', 'rowtotal'];
yes_private_1824_13 = private_1824_13[0];

privc_2534_13 = privcov_by_age_13['25-34'];
private_2534_13 = privc_2534_13/privc_2534_13.ix['coltotal', 'rowtotal'];
yes_private_2534_13 = private_2534_13[0];

privc_3544_13 = privcov_by_age_13['35-44'];
private_3544_13 = privc_3544_13/privc_3544_13.ix['coltotal', 'rowtotal'];
yes_private_3544_13 = private_3544_13[0];

privc_4554_13 = privcov_by_age_13['45-54'];
private_4554_13 = privc_4554_13/privc_4554_13.ix['coltotal', 'rowtotal'];
yes_private_4554_13 = private_4554_13[0];

privc_5564_13 = privcov_by_age_13['55-64'];
private_5564_13 = privc_5564_13/privc_5564_13.ix['coltotal', 'rowtotal'];
yes_private_5564_13 = private_5564_13[0];

privc_more65_13 = privcov_by_age_13['>65'];
private_more65_13 = privc_more65_13/privc_more65_13.ix['coltotal', 'rowtotal'];
yes_private_more65_13 = private_more65_13[0];

# select crosstab by column and convert to percentile per age group
pubc_less18_13 = pubcov_by_age_13['<18'];
public_less18_13 = pubc_less18_13/pubc_less18_13.ix['coltotal', 'rowtotal'];
yes_public_less18_13 = public_less18_13[0];

pubc_1824_13 = pubcov_by_age_13['18-24'];
public_1824_13 = pubc_1824_13/pubc_1824_13.ix['coltotal', 'rowtotal'];
yes_public_1824_13 = public_1824_13[0];

pubc_2534_13 = pubcov_by_age_13['25-34'];
public_2534_13 = pubc_2534_13/pubc_2534_13.ix['coltotal', 'rowtotal'];
yes_public_2534_13 = public_2534_13[0];

pubc_3544_13 = pubcov_by_age_13['35-44'];
public_3544_13 = pubc_3544_13/pubc_3544_13.ix['coltotal', 'rowtotal'];
yes_public_3544_13 = public_3544_13[0];

pubc_4554_13 = pubcov_by_age_13['45-54'];
public_4554_13 = pubc_4554_13/pubc_4554_13.ix['coltotal', 'rowtotal'];
yes_public_4554_13 = public_4554_13[0];

pubc_5564_13 = pubcov_by_age_13['55-64'];
public_5564_13 = pubc_5564_13/pubc_5564_13.ix['coltotal', 'rowtotal'];
yes_public_5564_13 = public_5564_13[0];

pubc_more65_13 = pubcov_by_age_13['>65'];
public_more65_13 = pubc_more65_13/pubc_more65_13.ix['coltotal', 'rowtotal'];
yes_public_more65_13 = public_more65_13[0];

# select crosstab by column and convert to percentile per age group
iacc_less18_13 = access_by_age_13['<18'];
access_less18_13 = iacc_less18_13/iacc_less18_13.ix['coltotal', 'rowtotal'];
yes_access_less18_13 = access_less18_13[0];

iacc_1824_13 = access_by_age_13['18-24'];
access_1824_13 = iacc_1824_13/iacc_1824_13.ix['coltotal', 'rowtotal'];
yes_access_1824_13 = access_1824_13[0];

iacc_2534_13 = access_by_age_13['25-34'];
access_2534_13 = iacc_2534_13/iacc_2534_13.ix['coltotal', 'rowtotal'];
yes_access_2534_13 = access_2534_13[0];

iacc_3544_13 = access_by_age_13['35-44'];
access_3544_13 = iacc_3544_13/iacc_3544_13.ix['coltotal', 'rowtotal'];
yes_access_3544_13 = access_3544_13[0];

iacc_4554_13 = access_by_age_13['45-54'];
access_4554_13 = iacc_4554_13/iacc_4554_13.ix['coltotal', 'rowtotal'];
yes_access_4554_13 = access_4554_13[0];

iacc_5564_13 = access_by_age_13['55-64'];
access_5564_13 = iacc_5564_13/iacc_5564_13.ix['coltotal', 'rowtotal'];
yes_access_5564_13 = access_5564_13[0];

iacc_more65_13 = access_by_age_13['>65'];
access_more65_13 = iacc_more65_13/iacc_more65_13.ix['coltotal', 'rowtotal'];
yes_access_more65_13 = access_more65_13[0];
In [18]:
# Create and style traces
yes_acc_13 = go.Bar(
    x=['< 18', '18-24', '25-34', '35-44', '45-54', '55-64', '> 64'],
    y=[yes_access_less18_13, yes_access_1824_13, yes_access_2534_13, yes_access_3544_13, yes_access_4554_13, yes_access_5564_13, yes_access_more65_13],
    name = 'internet access'
);

yes_privc_13 = go.Bar(
    x=['< 18', '18-24', '25-34', '35-44', '45-54', '55-64', '> 64'],
    y=[yes_private_less18_13, yes_private_1824_13, yes_private_2534_13, yes_private_3544_13, yes_private_4554_13, yes_private_5564_13, yes_private_more65_13],
    name = 'private health insurance'
);

yes_pubc_13 = go.Bar(
    x=['< 18', '18-24', '25-34', '35-44', '45-54', '55-64', '> 64'],
    y=[yes_public_less18_13, yes_public_1824_13, yes_public_2534_13, yes_public_3544_13, yes_public_4554_13, yes_public_5564_13, yes_public_more65_13],
    name = 'public health insurance'
);

data = [yes_acc_13, yes_pubc_13, yes_privc_13]

# Edit the layout
layout = go.Layout(
    title = 'Internet Access and Health Insurance by Type for New Yorkers by Age, 2013-15',
    xaxis = dict(title = 'Age Group'),
    yaxis = dict(title = 'Percent of Population'),
    barmode = 'group'
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='internet-access-and-health-insurance-type-by-age-13')

race eda

Questions:

  • distribution of race?
  • internet access type by race?
In [19]:
# chi-squared goodness-of-fit test for categorical variable 'RAC1P'
# todo
In [20]:
#race by age group
race_by_age_13 = pd.crosstab(mergeNY13_df.RAC1P, mergeNY13_df.age_group_13, margins=True);
race_by_age_13.columns = ['<18', '18-24', '25-34', '35-44', '45-54', '55-64', '>65', 'rowtotal'];
race_by_age_13.index= ['white', 'black', 'american indian', 'alaska native', 'catch-all native', 'asian', 'pacific islander', 'other', '2+ races', 'coltotal'];
race_by_age_13
Out[20]:
<18 18-24 25-34 35-44 45-54 55-64 >65 rowtotal
white 10167 15445 15822 21520 20756 28636 25524 137870
black 2454 3324 3186 3715 3107 6344 3266 25396
american indian 35 55 49 75 65 124 51 454
alaska native 0 0 1 1 0 1 0 3
catch-all native 26 30 23 39 18 81 22 239
asian 1417 2254 2217 2078 1743 3263 1381 14353
pacific islander 11 16 14 15 7 21 4 88
other 1215 1819 1577 1333 805 3344 678 10771
2+ races 546 674 522 492 375 2171 319 5099
coltotal 15871 23617 23411 29268 26876 43985 31245 194273
In [21]:
# chi-squared test of independence to test null hypothesis 
# that there is no association between race and age 
observed = race_by_age_13.ix[0:9,0:7]

observed
Out[21]:
<18 18-24 25-34 35-44 45-54 55-64 >65
white 10167 15445 15822 21520 20756 28636 25524
black 2454 3324 3186 3715 3107 6344 3266
american indian 35 55 49 75 65 124 51
alaska native 0 0 1 1 0 1 0
catch-all native 26 30 23 39 18 81 22
asian 1417 2254 2217 2078 1743 3263 1381
pacific islander 11 16 14 15 7 21 4
other 1215 1819 1577 1333 805 3344 678
2+ races 546 674 522 492 375 2171 319
In [22]:
# expected
expected = np.outer(race_by_age_13['rowtotal'][0:9],
                   race_by_age_13.ix['coltotal'][0:7]) / 194273

expected = pd.DataFrame(expected)

expected.columns = ['<18', '18-24', '25-34', '35-44', '45-54', '55-64', '>65']
expected.index= ['white', 'black', 'american indian', 'alaska native', 'catch-all native', 'asian', 'pacific islander', 'other', '2+ races']

expected
Out[22]:
<18 18-24 25-34 35-44 45-54 55-64 >65
white 11263 16760 16614 20770 19073 31214 22173
black 2074 3087 3060 3826 3513 5749 4084
american indian 37 55 54 68 62 102 73
alaska native 0 0 0 0 0 0 0
catch-all native 19 29 28 36 33 54 38
asian 1172 1744 1729 2162 1985 3249 2308
pacific islander 7 10 10 13 12 19 14
other 879 1309 1297 1622 1490 2438 1732
2+ races 416 619 614 768 705 1154 820
In [29]:
stats.chi2_contingency(observed=observed)
Out[29]:
(5559.9917683424082,
 0.0,
 48,
 array([[  1.12631955e+04,   1.67603104e+04,   1.66141181e+04,
           2.07706638e+04,   1.90731297e+04,   3.12148984e+04,
           2.21736842e+04],
        [  2.07470887e+03,   3.08729124e+03,   3.06036225e+03,
           3.82600839e+03,   3.51331835e+03,   5.74986262e+03,
           4.08444828e+03],
        [  3.70892198e+01,   5.51909838e+01,   5.47095788e+01,
           6.83969054e+01,   6.28069984e+01,   1.02789322e+02,
           7.30169916e+01],
        [  2.45082950e-01,   3.64698131e-01,   3.61517040e-01,
           4.51961930e-01,   4.15024218e-01,   6.79224596e-01,
           4.82491134e-01],
        [  1.95249417e+01,   2.90542844e+01,   2.88008576e+01,
           3.60063004e+01,   3.30635961e+01,   5.41115595e+01,
           3.84384603e+01],
        [  1.17255853e+03,   1.74483742e+03,   1.72961803e+03,
           2.16233653e+03,   1.98561420e+03,   3.24963688e+03,
           2.30839841e+03],
        [  7.18909987e+00,   1.06978118e+01,   1.06044999e+01,
           1.32575499e+01,   1.21740437e+01,   1.99239215e+01,
           1.41530733e+01],
        [  8.79929486e+02,   1.30938786e+03,   1.29796668e+03,
           1.62269398e+03,   1.49007529e+03,   2.43864271e+03,
           1.73230400e+03],
        [  4.16559321e+02,   6.19865257e+02,   6.14458463e+02,
           7.68184627e+02,   7.05402830e+02,   1.15445541e+03,
           8.20074097e+02]]))
In [24]:
# chi-squared test of independence to test null hypothesis 
# that there is no association between access and race 
access_by_race_13 = pd.crosstab(mergeNY13_df.RAC1P, mergeNY13_df.ACCESS, margins=True);
access_by_race_13.columns = ['yes', 'yes w/o sub', 'no', 'rowtotal'];
access_by_race_13.index= ['white', 'black', 'american indian', 'alaska native', 'catch-all native', 'asian', 'pacific islander', 'other', '2+ races', 'coltotal'];
access_by_race_13
Out[24]:
yes yes w/o sub no rowtotal
white 109408 4306 18614 132328
black 17429 1103 4612 23144
american indian 298 36 104 438
alaska native 3 0 0 3
catch-all native 140 4 90 234
asian 12386 361 1205 13952
pacific islander 56 5 20 81
other 8000 455 1850 10305
2+ races 4023 243 590 4856
coltotal 151743 6513 27085 185341
In [25]:
race_age_observed = access_by_race_13.ix[0:9,0:3]

race_age_observed
Out[25]:
yes yes w/o sub no
white 109408 4306 18614
black 17429 1103 4612
american indian 298 36 104
alaska native 3 0 0
catch-all native 140 4 90
asian 12386 361 1205
pacific islander 56 5 20
other 8000 455 1850
2+ races 4023 243 590
In [28]:
chi2_contingency(race_age_observed)
stats.chi2_contingency(race_age_observed)
Out[28]:
(1547.6352059452531,
 0.0,
 16,
 array([[  1.08340020e+05,   4.65008964e+03,   1.93378901e+04],
        [  1.89485327e+04,   8.13294803e+02,   3.38217254e+03],
        [  3.58600817e+02,   1.53915971e+01,   6.40075860e+01],
        [  2.45616998e+00,   1.05421898e-01,   4.38408123e-01],
        [  1.91581258e+02,   8.22290805e+00,   3.41958336e+01],
        [  1.14228278e+04,   4.90282107e+02,   2.03889005e+03],
        [  6.63165894e+01,   2.84639125e+00,   1.18370193e+01],
        [  8.43694388e+03,   3.62124220e+02,   1.50593190e+03],
        [  3.97572047e+03,   1.70642912e+02,   7.09636616e+02]]))
In [30]:
age = mergeNY13_df.AGEP
fam_income = mergeNY13_df.FINCP
race = mergeNY13_df.RAC1P
In [31]:
result = sm.ols(formula="age ~ fam_income + race", data=mergeNY13_df).fit()
print result.params
Intercept     4.083207e+01
fam_income    5.646131e-07
race         -1.286298e+00
dtype: float64
In [32]:
print result.summary()
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    age   R-squared:                       0.017
Model:                            OLS   Adj. R-squared:                  0.017
Method:                 Least Squares   F-statistic:                     1361.
Date:                Tue, 14 Mar 2017   Prob (F-statistic):               0.00
Time:                        16:50:35   Log-Likelihood:            -7.0019e+05
No. Observations:              154073   AIC:                         1.400e+06
Df Residuals:                  154070   BIC:                         1.400e+06
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     40.8321      0.098    416.281      0.000      40.640      41.024
fam_income  5.646e-07   5.12e-07      1.102      0.270    -4.4e-07    1.57e-06
race          -1.2863      0.025    -51.878      0.000      -1.335      -1.238
==============================================================================
Omnibus:                    31812.229   Durbin-Watson:                   1.270
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             6355.232
Skew:                           0.090   Prob(JB):                         0.00
Kurtosis:                       2.021   Cond. No.                     2.63e+05
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.63e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
In [66]:
mergeNY14_df.GRPIP.describe()
Out[66]:
count     203640
unique       102
top             
freq      143089
Name: GRPIP, dtype: object
In [64]:
mergeNY13_df.GRPIP.iplot(kind='box', filename='rent-of-income-13')
In [70]:
grpip_14 = mergeNY14_df.GRPIP.apply(pd.to_numeric, errors='coerce')
In [71]:
grpip_14.iplot(kind='box', filename='rent-of-income-14')
In [65]:
mergeNY15_df.GRPIP.iplot(kind='box', filename='rent-of-income-15')
In [80]:
mergeNY13_df.FINCP.iplot(kind='box', filename='fam_income_13')
In [79]:
mergeNY13_df.HINCP.iplot(kind='box', filename='hh_income_13')
In [78]:
mergeNY14_df.FINCP.iplot(kind='box', filename='fam_income_14')
In [77]:
mergeNY15_df.FINCP.iplot(kind='box', filename='fam_income_15')
In [ ]: